{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 爬取新浪网新闻信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "res = requests.get('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gjxw&level==1||=2&show_ext=1&show_all=1&show_num=500&tag=1&format=json&page=1&callback=newsloadercallback&_=1496538497721')\n",
    "#res.text\n",
    "jd = res.text.lstrip('  newsloadercallback(').rstrip(');')\n",
    "data = json.loads(jd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 建立新闻内文抓取函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "def getArticle(url):\n",
    "    res = requests.get(url)\n",
    "    res.encoding = 'utf-8'\n",
    "    soup = BeautifulSoup(res.text, 'html.parser')\n",
    "    return ' '.join([p.text.strip() for p in soup.select('#artibody p')])\n",
    "\n",
    "#getArticle('http://news.sina.com.cn/o/2017-06-04/doc-ifyfuvpm7356166.shtml')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取新闻资料"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ary = []\n",
    "for rec in data['result']['data']:\n",
    "    try:\n",
    "        ary.append({'title':rec['ext3'], 'url':rec['url'], 'content': getArticle(rec['url'])})\n",
    "    except:\n",
    "        print(rec['ext3'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 储存新闻数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas\n",
    "df = pandas.DataFrame(ary)\n",
    "df.to_excel('news.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取新闻数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>title</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>英国《每日电讯报》记者Harry Yorke 称，伦敦桥附近再次传出巨大爆炸声，爆炸强度是半...</td>\n",
       "      <td>伦敦桥附近再传爆炸声</td>\n",
       "      <td>http://news.sina.com.cn/w/zx/2017-06-04/doc-if...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[环球网综合报道]据英国《每日邮报》6月1日报道，近日，美国洛杉矶市比弗利山庄罗迪欧大道诊所...</td>\n",
       "      <td>美诊所1.5万人整形记录被盗</td>\n",
       "      <td>http://news.sina.com.cn/w/zx/2017-06-04/doc-if...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[环球网综合报道]据美国有线电视新闻网4日报道，当地时间3日，英国伦敦相继发生三起袭击事件，...</td>\n",
       "      <td>特朗普“发推”声援伦敦</td>\n",
       "      <td>http://news.sina.com.cn/o/2017-06-04/doc-ifyfu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>原标题：军情锐评 | 文在寅为何成“挺萨德派”？俄军对付美军有一套…… 近日，美国展示拦截洲...</td>\n",
       "      <td>文在寅为何成“挺萨德派”</td>\n",
       "      <td>http://news.sina.com.cn/w/zx/2017-06-04/doc-if...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>中新网6月4日电 据外媒报道，在伦敦桥和巴罗市场3日遭恐袭后，有目击者称，在巴罗市场连续传来...</td>\n",
       "      <td>2名疑似袭击者被击毙</td>\n",
       "      <td>http://news.sina.com.cn/o/2017-06-04/doc-ifyfu...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             content           title  \\\n",
       "0  英国《每日电讯报》记者Harry Yorke 称，伦敦桥附近再次传出巨大爆炸声，爆炸强度是半...      伦敦桥附近再传爆炸声   \n",
       "1  [环球网综合报道]据英国《每日邮报》6月1日报道，近日，美国洛杉矶市比弗利山庄罗迪欧大道诊所...  美诊所1.5万人整形记录被盗   \n",
       "2  [环球网综合报道]据美国有线电视新闻网4日报道，当地时间3日，英国伦敦相继发生三起袭击事件，...     特朗普“发推”声援伦敦   \n",
       "3  原标题：军情锐评 | 文在寅为何成“挺萨德派”？俄军对付美军有一套…… 近日，美国展示拦截洲...    文在寅为何成“挺萨德派”   \n",
       "4  中新网6月4日电 据外媒报道，在伦敦桥和巴罗市场3日遭恐袭后，有目击者称，在巴罗市场连续传来...      2名疑似袭击者被击毙   \n",
       "\n",
       "                                                 url  \n",
       "0  http://news.sina.com.cn/w/zx/2017-06-04/doc-if...  \n",
       "1  http://news.sina.com.cn/w/zx/2017-06-04/doc-if...  \n",
       "2  http://news.sina.com.cn/o/2017-06-04/doc-ifyfu...  \n",
       "3  http://news.sina.com.cn/w/zx/2017-06-04/doc-if...  \n",
       "4  http://news.sina.com.cn/o/2017-06-04/doc-ifyfu...  "
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas\n",
    "df = pandas.read_excel('Data/news.xlsx') \n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 安裝Jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting jieba\n",
      "  Using cached jieba-0.38.zip\n",
      "Building wheels for collected packages: jieba\n",
      "  Running setup.py bdist_wheel for jieba: started\n",
      "  Running setup.py bdist_wheel for jieba: finished with status 'done'\n",
      "  Stored in directory: C:\\Users\\User\\AppData\\Local\\pip\\Cache\\wheels\\fd\\8c\\07\\e495d158d91e11f50c829520f77d20d1f408f18a13cdb15d02\n",
      "Successfully built jieba\n",
      "Installing collected packages: jieba\n",
      "Successfully installed jieba-0.38\n"
     ]
    }
   ],
   "source": [
    "! pip install jieba"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用Jieba 断词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "titles   = []\n",
    "articles = [] \n",
    "for rec in df.iterrows():\n",
    "    #print(rec[1].content)\n",
    "    articles.append(' '.join(jieba.cut(rec[1].content)))\n",
    "    titles.append(rec[1].title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "498"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(articles)\n",
    "len(titles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'英国 《 每日 电讯报 》 记者 Harry   Yorke   称 ， 伦敦桥 附近 再次 传出 巨大 爆炸声 ， 爆炸 强度 是 半小时 前 传出 的 三起 爆炸声 的 四五倍 。   伦敦 警方 已经 证实 ， 近 45 分钟 内 发生 的 一系列 爆炸 为 “ 受控 爆破 ” （ controlled \\xa0 explosions ） 。 “ 受控 爆破 ” 是 指 引爆 或者 拆除 可疑 爆炸 装置 的 一种 方法 。 （ 央视 记者 \\xa0 张勇 ）   责任编辑 ： 张迪'"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "articles[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 建立词频矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer = CountVectorizer() \n",
    "X = vectorizer.fit_transform(articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(498, 16527)"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 计算余弦距离(Cosine Similarity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "cosine_similarities  = cosine_similarity(X, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.        ,  0.03107505,  0.14547859, ...,  0.05066481,\n",
       "         0.1387863 ,  0.01562691],\n",
       "       [ 0.03107505,  1.        ,  0.13350354, ...,  0.07793338,\n",
       "         0.14340031,  0.02294493],\n",
       "       [ 0.14547859,  0.13350354,  1.        , ...,  0.19071546,\n",
       "         0.29149942,  0.01790287],\n",
       "       ..., \n",
       "       [ 0.05066481,  0.07793338,  0.19071546, ...,  1.        ,\n",
       "         0.15117901,  0.08550731],\n",
       "       [ 0.1387863 ,  0.14340031,  0.29149942, ...,  0.15117901,\n",
       "         1.        ,  0.03795401],\n",
       "       [ 0.01562691,  0.02294493,  0.01790287, ...,  0.08550731,\n",
       "         0.03795401,  1.        ]])"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cosine_similarities.shape\n",
    "cosine_similarities"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用KMeans 分群"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "c = KMeans(n_clusters=10, init = 'k-means++', random_state=123)\n",
    "k_data = c.fit_predict(cosine_similarities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4, 4, 0, 9, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 7, 3, 4, 2, 5, 4, 3, 3,\n",
       "       5, 4, 6, 4, 4, 4, 4, 2, 0, 3, 1, 2, 3, 4, 4, 4, 4, 4, 5, 4, 3, 4, 3,\n",
       "       4, 7, 3, 3, 4, 4, 2, 3, 4, 4, 7, 4, 4, 2, 9, 2, 1, 4, 3, 8, 4, 7, 4,\n",
       "       9, 2, 7, 2, 7, 4, 4, 5, 7, 4, 4, 2, 2, 2, 7, 7, 3, 4, 4, 8, 7, 3, 4,\n",
       "       2, 5, 2, 4, 3, 7, 2, 9, 7, 5, 5, 2, 3, 4, 3, 4, 2, 3, 8, 7, 3, 4, 8,\n",
       "       2, 0, 2, 2, 2, 3, 9, 4, 3, 8, 4, 4, 4, 3, 3, 7, 4, 4, 5, 4, 2, 3, 4,\n",
       "       5, 5, 3, 5, 0, 5, 3, 4, 5, 4, 2, 2, 3, 4, 2, 4, 3, 6, 0, 4, 0, 7, 4,\n",
       "       4, 3, 4, 3, 4, 3, 8, 4, 5, 8, 4, 2, 4, 4, 4, 4, 4, 3, 9, 4, 7, 3, 4,\n",
       "       2, 4, 2, 9, 4, 4, 3, 9, 3, 4, 4, 8, 4, 0, 4, 4, 3, 9, 4, 8, 8, 2, 8,\n",
       "       4, 2, 3, 4, 4, 5, 4, 6, 4, 4, 1, 5, 9, 4, 3, 5, 2, 9, 9, 8, 4, 4, 7,\n",
       "       3, 5, 4, 4, 4, 5, 6, 4, 0, 3, 2, 9, 2, 3, 4, 8, 8, 4, 6, 4, 6, 9, 1,\n",
       "       8, 3, 6, 6, 9, 1, 6, 1, 8, 1, 6, 6, 4, 8, 6, 4, 8, 8, 3, 4, 6, 4, 4,\n",
       "       6, 6, 5, 4, 6, 4, 4, 5, 4, 8, 7, 8, 0, 5, 9, 3, 8, 6, 4, 4, 3, 0, 1,\n",
       "       1, 9, 0, 5, 3, 3, 8, 3, 5, 3, 8, 4, 5, 3, 4, 3, 9, 8, 3, 3, 4, 4, 8,\n",
       "       4, 4, 4, 8, 3, 4, 4, 9, 4, 3, 3, 4, 4, 3, 5, 4, 3, 4, 4, 4, 7, 8, 4,\n",
       "       4, 3, 4, 3, 4, 0, 4, 4, 0, 0, 4, 4, 4, 3, 4, 5, 4, 1, 4, 7, 8, 8, 9,\n",
       "       4, 4, 3, 4, 7, 4, 0, 4, 7, 4, 5, 1, 8, 4, 7, 4, 9, 5, 4, 4, 3, 4, 3,\n",
       "       4, 1, 1, 4, 5, 4, 4, 5, 4, 4, 3, 4, 8, 3, 2, 5, 3, 4, 0, 4, 7, 8, 0,\n",
       "       5, 4, 0, 5, 0, 4, 3, 5, 4, 4, 5, 7, 3, 4, 4, 4, 5, 0, 4, 4, 4, 4, 7,\n",
       "       4, 0, 3, 4, 4, 5, 3, 4, 4, 4, 4, 4, 4, 0, 8, 0, 4, 4, 0, 8, 4, 8, 4,\n",
       "       4, 8, 8, 5, 4, 8, 4, 4, 4, 0, 5, 8, 4, 8, 0, 4, 3, 3, 3, 4, 3, 8, 4,\n",
       "       4, 8, 5, 4, 0, 3, 4, 4, 4, 5, 0, 4, 5, 0, 4])"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "k_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 产生分群结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#k_data == 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['文在寅为何成“挺萨德派”', '美韩同盟裂痕扩大？', '环评分歧或影响萨德部署', '韩强调将落实萨德环评工作',\n",
       "       '韩方漏报萨德意使部署不可逆', '韩防长回应瞒报“萨德”入韩', '韩国女子美使馆前反萨德', '韩青瓦台通知前国防部长受查',\n",
       "       '文在寅:不会改变部署决定', '文在寅:不改变萨德部署决定', '韩媒：韩军或遭大换血', '萨德瞒报丑闻指向韩防长',\n",
       "       '文在寅:不会改变萨德部署决定', '韩防长:未曾下令瞒报萨德入韩', '美方:韩萨德仅有初步拦截能力',\n",
       "       '韩方回应萨德发射车暗中入韩', '韩军瞒报4辆萨德车秘密入境', '萨德发射架新政府成立前入境', '4套萨德发射车“秘密”入韩',\n",
       "       '美媒:萨德成文在寅手中悬案', '外媒:萨德成文在寅烫手山芋'], \n",
       "      dtype='<U16')"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy\n",
    "titles_ary = numpy.array(titles)\n",
    "titles_ary[k_data == 9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
